Using wine.csv in the data folder: run KMeans with n_clusters = 3 and compare the clusters to the Wine column; then run KMeans and Hierarchical Clustering on the PCA-transformed data and compare the clusters to the Wine column again.
In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
%matplotlib inline
np.set_printoptions(suppress=True)
In [106]:
wine = pd.read_csv('../data/wine.csv')
In [107]:
wine.tail()
Out[107]:
In [108]:
# shift the Wine class labels from 1-3 to 0-2 so they match the 0-based cluster labels
wine.Wine = wine.Wine - 1
In [109]:
y = wine.Wine
In [110]:
X = wine.iloc[:,1:]
In [111]:
kmeans = KMeans(n_clusters = 3, random_state = 1)
Y_hat_kmeans = kmeans.fit(X).labels_
In [112]:
# plot the first two features, colored by KMeans cluster, point size from the fifth feature
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=Y_hat_kmeans, s=X.iloc[:, 4]*2)
Out[112]:
In [113]:
print(confusion_matrix(Y_hat_kmeans, y))
plt.matshow(confusion_matrix(Y_hat_kmeans, y))
plt.title('confusion matrix')
plt.xlabel('actual values')
plt.ylabel('Y_hat_kmeans')
plt.colorbar()
Out[113]:
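KMeans labels are arbitrary cluster IDs, so the rows of this confusion matrix can be permuted relative to the actual Wine classes. Below is a minimal sketch of scoring the agreement after matching each cluster to a class, assuming SciPy is available; the aligned_accuracy helper is illustrative and not part of the original notebook.
from scipy.optimize import linear_sum_assignment

def aligned_accuracy(y_true, y_pred):
    # match each cluster to the class it overlaps most, then score the agreement
    cm = confusion_matrix(y_true, y_pred)
    rows, cols = linear_sum_assignment(-cm)  # maximize matched counts
    return cm[rows, cols].sum() / float(cm.sum())

aligned_accuracy(y, Y_hat_kmeans)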
In [114]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
In [115]:
X_scale = preprocessing.scale(X)
comp = np.arange(14)
explained_var = []
for i in comp:
    # fit PCA with i components on the scaled data and record the cumulative explained variance
    pca = PCA(n_components=i)
    X_pca = pca.fit_transform(X_scale)
    explained_var.append(pca.explained_variance_ratio_.sum())
plt.plot(comp, explained_var)
Out[115]:
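The same curve can be read off a single fit: keep all components and take the cumulative sum of the explained-variance ratios. This is a sketch rather than part of the original run; pca_full and cum_var are illustrative names.
pca_full = PCA().fit(X_scale)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(np.arange(1, len(cum_var) + 1), cum_var)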
In [116]:
pca.explained_variance_ratio_
Out[116]:
In [117]:
comp = np.arange(13) + 1
explained_var = []
for i in comp:
    # same scan, but on the unscaled data; these ratios are dominated by the largest-magnitude features
    pca = PCA(n_components=i)
    X_pca = pca.fit_transform(X)
    explained_var.append(pca.explained_variance_ratio_.sum())
plt.plot(comp, explained_var)
Out[117]:
In [118]:
print(pca.explained_variance_ratio_)
In [119]:
pca = PCA(n_components=4)
X_pca = pca.fit_transform(X)
In [120]:
Y_hat_kmeans = kmeans.fit(X_pca).labels_
In [121]:
plt.scatter(X_pca[:,0],X_pca[:,1], c = Y_hat_kmeans)
plt.colorbar()
Out[121]:
In [122]:
# compute distance matrix
from scipy.spatial.distance import pdist, squareform
distx = squareform(pdist(X_pca, metric='euclidean'))
distx
Out[122]:
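The exercise also calls for Hierarchical Clustering on the PCA data; the notebook stops at the distance matrix, so here is a minimal sketch with SciPy's agglomerative tools. Ward linkage and a cut into three flat clusters are assumptions made for illustration.
from scipy.cluster.hierarchy import linkage, fcluster

Z = linkage(X_pca, method='ward')                         # build the merge tree on the PCA-reduced data
Y_hat_hier = fcluster(Z, t=3, criterion='maxclust') - 1   # cut into 3 clusters, relabel to 0-2
print(confusion_matrix(Y_hat_hier, y))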
In [123]:
kmeans = KMeans(n_clusters = 3, random_state = 1)
Y_hat_kmeans = kmeans.fit(X_pca).labels_
In [124]:
print(confusion_matrix(Y_hat_kmeans, y))
plt.matshow(confusion_matrix(Y_hat_kmeans, y))
plt.title('confusion matrix')
plt.xlabel('actual values')
plt.ylabel('Y_hat_kmeans')
plt.colorbar()
Out[124]: